from BasicLibraries import *
import utils_nc as ut


'''
Functions to make the dataset
'''

#================================================================================
#================================================================================

def get_main_dataset(environmental_sdgs=None):
    '''
    Load the main dataset and split it into environmental and social views.

    Reads 'local_data/LC_dataset_v_1_1C.csv', keeps the rows with
    rfyear > 2009, and builds two frames from it: one without the social SDG
    columns and one without the environmental SDG columns. Each frame gets an
    extra column with the row-wise sum of the SDG columns it keeps.

    Parameters
    ----------
    environmental_sdgs : iterable of int, optional
        SDG numbers to treat as environmental. Defaults to
        [6, 7, 11, 12, 13, 14, 15]. Every other SDG in 1..17 is treated as
        social, so the two groups never share a column (a shared column would
        be dropped from one frame and then fail the sum below).

    Returns
    -------
    tuple
        (db, envData, envKeys, socialData, socialKeys), where envKeys and
        socialKeys are the column labels returned by
        ut.utils().get_SpecificSDGsColumns for each SDG group.
    '''
    if environmental_sdgs is None:
        environmental_sdgs = [6, 7, 11, 12, 13, 14, 15]
    environmental_sdgs = list(environmental_sdgs)
    # With the default list this yields [1, 2, 3, 4, 5, 8, 9, 10, 16, 17].
    social_sdgs = [sdg for sdg in range(1, 18) if sdg not in environmental_sdgs]

    db = pd.read_csv('local_data/LC_dataset_v_1_1C.csv')
    #=== Here we only look at the period 2010-2020
    db = db[db.rfyear > 2009]

    ### Divide environmental from Social Data
    helper = ut.utils()
    envKeys = helper.get_SpecificSDGsColumns(environmental_sdgs)
    socialKeys = helper.get_SpecificSDGsColumns(social_sdgs)

    # drop() returns new frames, so adding a column does not touch db.
    envData = db.drop(columns=socialKeys)
    envData['Environmental SDGs'] = envData[envKeys].sum(axis=1)

    socialData = db.drop(columns=envKeys)
    socialData['Socio-economic SDGs'] = socialData[socialKeys].sum(axis=1)

    return db, envData, envKeys, socialData, socialKeys

def MergeWithAccounting(X, da):
    '''
    Merge X with the accounting table da on the 'mrg' key.

    The key is rebuilt from da with ut.utils().make_mrg(da, 'fyear') on a
    derived frame, so the caller's da is not modified. The columns 'fyear',
    'gvkey', 'GICS_level_1' and 'loc' are removed from the accounting side
    before merging (da must contain all four, otherwise drop raises KeyError).

    Returns the merged DataFrame (pandas default inner join on 'mrg').
    '''
    accounting = da.assign(mrg=ut.utils().make_mrg(da, 'fyear'))
    accounting = accounting.drop(columns=['fyear', 'gvkey', 'GICS_level_1', 'loc'])
    return X.merge(accounting, on='mrg')
def text_data_fro_topic_analysis():
    '''
    Build and save the sentence table used for topic analysis.

    Loads the sentence frames stored in the two pickle files, stacks them and
    keeps the first row per index value. It then attaches the alignment label
    of every firm in the 2019 'below' universe whose horizon_year is 2025 and
    whose sign is -1 or 1, and writes the result to
    'local_data/sentences_for_topic_analysis.csv'.

    Returns None; the only output is the CSV file.
    '''
    # NOTE(security): pickle.load can execute arbitrary code; these files must
    # be trusted, locally produced artefacts.
    # Each pickle holds a pair; only its second element is used here.
    with open('SDGRevisited/SDG_revisited.pckl', 'rb') as handle:
        _, sentences = pickle.load(handle)
    with open('ClimateSDGs/climate_actions_update.pckl', 'rb') as handle:
        _, sentences2 = pickle.load(handle)

    sentences = pd.concat((sentences, sentences2))
    # Rows sharing an index value are collapsed to one.
    sentences = sentences.groupby(sentences.index).first()

    target_type = 'below'
    un = make_universe_targets_2020(2019, target_type=target_type)
    un = un[un.horizon_year == 2025]
    aligned = un[un.signed == -1]
    misaligned = un[un.signed == 1]
    alignment = pd.concat((aligned[['alignment', 'gvkey']], misaligned[['alignment', 'gvkey']]))

    # merge() without `on` joins on the columns the two frames share.
    paris_text = sentences.merge(alignment)
    paris_text.to_csv('local_data/sentences_for_topic_analysis.csv')
def MakeReducedData(X, envSDGs):
    '''
    Include only climate related actions.

    Replaces the initiative columns of X that belong to envSDGs with the
    values stored in the climate-action pickles, matching rows on
    'json_filename', and recomputes 'number_of_initiatives' as the row-wise
    sum of those columns.

    Parameters
    ----------
    X : DataFrame with a 'json_filename' column and the initiative columns.
        It is not modified.
    envSDGs : SDG selection passed (as a copy) to ut.utils().get_initiativeKeys.

    Returns
    -------
    DataFrame restricted to the rows of X whose 'json_filename' appears in the
    pickled tables (pandas default inner merge).
    '''
    # NOTE(security): pickle.load can execute arbitrary code; these files must
    # be trusted, locally produced artefacts.
    # Each pickle holds a pair; only its first element is used here.
    with open('ClimateSDGs/climate_actions.pckl', 'rb') as handle:
        ta, _ = pickle.load(handle)
    with open('ClimateSDGs/climate_actions_update.pckl', 'rb') as handle:
        ta2, _ = pickle.load(handle)

    ta = pd.concat((ta, ta2))
    # Keep one row per index value, then expose the index as the 'index' column.
    ta = ta.groupby(ta.index).first()
    ta = ta.reset_index()
    # The index entries are paths; keep only the part after the last '/'.
    ta['json_filename'] = ta['index'].apply(lambda x: x.split('/')[-1])

    helper = ut.utils()
    # Pass a copy so the helper cannot alter the caller's selection.
    relevant_keys = helper.get_initiativeKeys(helper.return_Initiatives(), envSDGs.copy())

    # drop() returns a new frame, so X itself stays untouched.
    ReducedData = X.drop(columns=relevant_keys)
    ta = ta[['json_filename'] + relevant_keys]
    ReducedData = ReducedData.merge(ta, on='json_filename')
    ReducedData['number_of_initiatives'] = ReducedData[relevant_keys].sum(axis=1)
    return ReducedData

#================================================================================
#================================================================================

def make_dataset(environmental_sdgs, PARIS=False, emissionProvider='TruCost'):
    '''
    Build the merged analysis tables from Compustat, TruCost and the main dataset.

    Parameters
    ----------
    environmental_sdgs :
        Passed to get_main_dataset and to MakeReducedData.
    PARIS :
        Not read anywhere in this function body.
    emissionProvider :
        Not read anywhere in this function body.

    Returns
    -------
    tuple
        (db, FullData, SECTORS, data, CleanData, ReducedData, TRUCOST,
        REFINITIV, comp):
        db          - environmental view of the main dataset, rfyear < 2022
        FullData    - copy of the full main dataset, rfyear < 2022
        SECTORS     - unique GICS_level_0 values of db except the last one
        data        - db merged with the TruCost emission columns on 'mrg'
        CleanData   - data merged with the accounting table, +/-inf -> NaN
        ReducedData - output of MakeReducedData, one row per 'mrg'
        TRUCOST     - the prepared emission table
        REFINITIV   - constant 0 (placeholder)
        comp        - the prepared accounting table
    '''
    #== Load Compustat
    print('Load COMPUSTAT')
    compO = pd.read_csv('local_data/Compustat_data.csv', low_memory=False)
    compO['mrg'] = ut.utils().make_mrg(compO, 'fyear')
    comp = pd.read_csv('local_data/Compustat_data_0423.csv', low_memory=False)
    comp = pd.concat((compO, comp))
    # One row per 'mrg'; rows of the older file come first in the stack.
    # NOTE(review): the key is only computed for the older file above; rows of
    # the newer file need a 'mrg' column in the CSV to survive this groupby - confirm.
    comp = comp.groupby('mrg').first().reset_index()
    # Both sectors are relabelled 'ICT'.
    comp['GICS_level_0'] = comp['GICS_level_1'].replace(['Information Technology', 'Communication Services'], ['ICT']*2)
    CRM  = pd.read_csv('local_data/CountryToRegionMapping.csv')
    # Keep the last SP_GEOGRAPHY per iso_code, then rename the pair to ('loc', 'MacroRegion').
    CRM = CRM[['SP_GEOGRAPHY', 'iso_code']].groupby('iso_code').last().reset_index()
    CRM.columns = ['loc', 'MacroRegion']
    CRM = CRM.replace(['Latin America and Caribbean', 'Africa', 'Middle East'], ['Others']*3)
    # No `on` given: joins on the columns both frames share (presumably 'loc' - verify).
    comp = comp.merge(CRM)
    comp['Tangibility'] = comp['ppent_usd']/comp['at_usd']
    comp['Profitability'] = comp['ebitda_usd']/comp['at_usd']
    comp = comp.sort_values(by = ['gvkey', 'fyear'])
    # Previous row's value within each gvkey (rows are ordered by fyear above).
    comp['Total_invested_capital_BS1_lagged'] = comp[['gvkey', 'Total_invested_capital_BS1']].groupby('gvkey').shift(1)
    comp['investment_log'] = comp['Total_invested_capital_BS1_lagged'].apply(np.log)
    # 3-row rolling means within each gvkey. The grouped result is flattened and
    # written back by position, so it depends on comp being sorted by gvkey/fyear
    # (done above). NOTE(review): verify length/order still match if gvkey has NaN.
    comp['Tangibility_H'] = comp[['Tangibility', 'gvkey']].groupby('gvkey').rolling(3).mean().values.ravel().tolist()
    comp['Profitability_H'] = comp[['Profitability', 'gvkey']].groupby('gvkey').rolling(3).mean().values.ravel().tolist()
    comp['firm_size_H'] = comp[['firm_size', 'gvkey']].groupby('gvkey').rolling(3).mean().values.ravel().tolist()
    comp['investment_log_H'] = comp[['investment_log', 'gvkey']].groupby('gvkey').rolling(3).mean().values.ravel().tolist()
    # Rebuild the merge key on the final accounting table.
    comp['mrg'] = ut.utils().make_mrg(comp, 'fyear')
    #=== TRUCOST
    print('Load TRUCOST')
    emissionO = pd.read_csv('local_data/TruCost.csv')
    emission = pd.read_csv('local_data/TruCost_DFT_0423.csv')
    emission = pd.concat((emissionO, emission))
    # One row per 'mrg'; both CSV files must already carry that column.
    emission = emission.groupby('mrg').first().reset_index()
    emission = emission.rename(columns = {'SP_GEOGRAPHY': 'MacroRegion'})
    emission['MacroRegion'] = emission['MacroRegion'].replace(['Latin America and Caribbean', 'Africa', 'Middle East'], ['Others']*3)
    # Drop the rows whose gvkey is missing.
    emission = emission.loc[emission.gvkey.dropna().index].reset_index(drop = True)
    emission['GICS_level_0'] = emission['GICS_level_1'].replace(['Information Technology', 'Communication Services'], ['ICT']*2)
    # Recompute the key as '<gvkey as int>-<cyear>'.
    emission['mrg'] = emission['gvkey'].astype(int).astype(str)+'-'+emission['cyear'].astype(str)
    #=== Make the long term emission variable as the cumulative sum of the two years ahead emission
    emission = emission.sort_values(by = ['gvkey', 'cyear'])
    emissionVAR = 'DirectControl'
    lagsEM = 3
    # shift(-n) pulls the value from n rows ahead within each gvkey, so the
    # 'emission_lag_n' columns hold future values despite the name.
    for n in range(1, lagsEM):
        emission['emission_lag_'+str(n)] = emission[['gvkey', emissionVAR]].groupby('gvkey').shift(-n)
    # Short term = current value + 1 row ahead; long term = 1 and 2 rows ahead.
    # skipna=False makes the sum NaN whenever any component is missing.
    emission['emissionShortTerm'] = emission[[emissionVAR] + ['emission_lag_'+str(n) for n in range(1, 2)]].sum(axis = 1,skipna=False)
    emission['emissionLongTerm'] = emission[['emission_lag_'+str(n) for n in range(1, lagsEM)]].sum(axis = 1,skipna=False)
    emission['emissionLongTerm_log'] = emission['emissionLongTerm'].apply(np.log)
    emission['emissionShortTerm_log'] = emission['emissionShortTerm'].apply(np.log)
    emission['emission_log'] = emission[emissionVAR].apply(np.log)
    emission = emission.sort_values(by = ['gvkey', 'cyear'])
    # Same positional write-back of a grouped 3-row rolling mean as for comp;
    # depends on the gvkey/cyear sort on the previous line.
    emission['DirectControl_H'] = emission[['DirectControl', 'gvkey']].groupby('gvkey').rolling(3).mean().values.ravel().tolist()
    emission = emission.groupby('mrg').last().reset_index()
    TRUCOST = emission.copy()
    
    #=== REFINITIV
    # Placeholder: no Refinitiv data is loaded here; 0 is returned as-is.
    REFINITIV=0
    

    print('Load LCBS')

    #== Get main dataset
    # NOTE(review): called with one positional argument - confirm that
    # get_main_dataset's signature accepts it.
    dbFL, envData, envKeys, socialData, socialKeys  =  get_main_dataset(environmental_sdgs)
    #=== Keep the full dataset somewhere
    FullData = dbFL.copy()
    #=== This is an extremely important point: from now on db is the environmental data
    #=== For this project, the database is just the environmental SDGs
    db = envData.copy()
    db['MacroRegion'] = db['MacroRegion'].replace(['Latin America and Caribbean', 'Africa', 'Middle East'], ['Others']*3)
    #=== Clean up names
    # The original count column is discarded and the environmental SDG sum takes its name.
    db = db.drop(columns = ['number_of_initiatives']).rename(columns ={'Environmental SDGs': 'number_of_initiatives'})
    db['GICS_level_0'] = db['GICS_level_0'].replace(['Consumer Discretionary', 'Consumer Staple'], ['Cons. Disc.', 'Cons. Staple'])
    # All unique sector labels except the last one in order of appearance.
    # NOTE(review): presumably the last entry is a missing/unwanted label - confirm.
    SECTORS = db.GICS_level_0.unique()[:-1]
    

    db = db[db.rfyear < 2022]
    FullData = FullData[FullData.rfyear < 2022]
    
    
    #== Put TruCost and LCBS together with cleaning
    # Default inner merge: only rows whose 'mrg' exists on both sides are kept.
    data = db.merge(TRUCOST[[emissionVAR, 'DirectControl_H', 'emission_log',  'emissionLongTerm', 'emissionShortTerm',  'emissionLongTerm_log', 'emissionShortTerm_log', 'mrg']], on = 'mrg')

    #== Merge with accounting data
    print('Merge LCBS+TRUCOST+COMPUSTAT')
    # The sector/region columns are dropped from the left side first; comp carries
    # columns with the same names (GICS_level_0, MacroRegion) that take their place.
    Z = MergeWithAccounting(data.drop(columns = ['GICS_level_0', 'GICS_level_2', 'MacroRegion']), comp)
    # np.log of zero or negative inputs above can yield +/-inf; turn those into NaN.
    Z = Z.replace([-np.inf, np.inf], [np.nan, np.nan])
    CleanData = Z.copy()
    CleanData = CleanData.sort_values(by = ['gvkey', 'rfyear'])
    # The lag is recomputed on the merged table: previous row per gvkey in rfyear order.
    CleanData['Total_invested_capital_BS1_lagged'] = CleanData[['gvkey', 'Total_invested_capital_BS1']].groupby('gvkey').shift(1)

    #== Replace the SDGs with only emission-related initiatives
    print('Retain only GHG-related initiatives')
    ReducedData = MakeReducedData(CleanData, environmental_sdgs)
    # Keep one row per 'mrg'.
    ReducedData = ReducedData.groupby('mrg').last().reset_index()


    return db, FullData, SECTORS, data, CleanData, ReducedData, TRUCOST, REFINITIV, comp


def make_reports_characteristics():
    '''
    Prepare the CSR-report characteristics table and write it to disk.

    Reads 'local_data/refinitiv_csr_reporting_0423.csv', keeps the firm/year
    identifiers plus three report flags, reduces fyear to the text before its
    first '-', keeps the last row per 'mrg' key and recodes the flags to 0/1.
    The result is saved to 'local_data/reports_characteristics.csv'.

    Returns None.
    '''
    flag_columns = [
        'GRI_Report_Guidelines',
        'CSR_Sustainability_Report_Global_Activities',
        'CSR_Sustainability_External_Audit',
    ]
    # Boolean and string spellings found in the raw file, with their 0/1 codes.
    raw_values = [False, True, 'False', 'True', '0', '1']
    coded_values = [0, 1, 0, 1, 0, 1]

    reports = pd.read_csv('local_data/refinitiv_csr_reporting_0423.csv')
    reports = reports[['fyear', 'gvkey'] + flag_columns]
    reports['fyear'] = reports['fyear'].apply(lambda value: str(value).split('-')[0])
    reports['mrg'] = ut.utils().make_mrg(reports, 'fyear')
    reports = reports.groupby('mrg').last().reset_index()

    for column in flag_columns:
        reports[column] = reports[column].replace(raw_values, coded_values)

    reports.to_csv('local_data/reports_characteristics.csv')

    
def make_universe_targets_2020(year = 2019, target_type = 'well_below'):
    '''
    Return the alignment universe for one calendar year.

    Stacks the cyear == 2018 rows of 'local_data/ParisAlignment.csv' on top of
    all rows of 'local_data/ParisAlignment_2020.csv', keeps the rows with
    cyear == year and collapses them to one row per 'mrg' key.

    Parameters
    ----------
    year : value compared against the cyear column.
    target_type : name of the column whose sign defines the alignment.

    Returns
    -------
    DataFrame with two added columns: 'signed' (np.sign of the target column)
    and 'alignment' ('Aligned' for -1, 'Misaligned' for 1, other values left
    unchanged).
    '''
    #== Get the TruCost Alignment Data
    older = pd.read_csv('local_data/ParisAlignment.csv').drop(columns=['Unnamed: 0'])
    older = older[older.cyear == 2018]
    newer = pd.read_csv('local_data/ParisAlignment_2020.csv').drop(columns=['Unnamed: 0'])

    universe = pd.concat((older, newer)).drop(columns=['SP_COMPANY_STATUS'])
    universe = universe[universe.cyear == year]
    universe['mrg'] = ut.utils().make_mrg(universe, 'cyear')
    universe = universe.groupby('mrg').last().reset_index()

    #== Get alignment and misalignment groups [choose the type of target]
    sign_codes = [-1, 1]
    sign_labels = ['Aligned', 'Misaligned']
    universe['signed'] = universe[target_type].apply(np.sign)
    universe['alignment'] = universe['signed'].replace(sign_codes, sign_labels)

    return universe